# preamble and load data #### 
rm(list=ls())

library(foreign)
library(survey)
library(xtable)
library(reshape2)
library(plotrix)
library(readstata13)
library(fastDummies)
library(dotwhisker)
library(RColorBrewer)
library(magrittr)
library(estimatr)
library(randomizr)
library(sandwich)
library(tidyverse)

setwd("~/Dropbox/facebook sampling/replication/mexico replication")
paperfigs<-"../figures/"
data_exports<-"../data_exports/"

load("mexico data/mexico_facebook_cleaned.Rda")

# time to complete statistics ####
mexico$Duration..in.seconds.<-as.numeric(as.character(mexico$Duration..in.seconds.))
quantile(mexico$Duration..in.seconds.) ## median response time: 385
quantile(mexico$Duration..in.seconds.)[3]/60
mean(mexico$Duration..in.seconds.)
mexico<-mexico%>%filter(Duration..in.seconds.>=quantile(mexico$Duration..in.seconds.)[3]/3) ## nobody filled out the survey in less than 1/3 median response time. 
incompletes%<>%rename(cell=cell.x) 

# Table 2: check cell targeting accuracy #### 
## these are manually inputted into the table from on the output of sum(diag(table...)) entries below. 
table(mexico$ageFB, mexico$agegroupselfreport)
sum(diag(table(mexico$ageFB,mexico$agegroupselfreport)))/sum(table(mexico$ageFB,mexico$agegroupselfreport))

mexico$genderselfreport <- as.character(mexico$genderselfreport)
table(mexico$genderFB, mexico$genderselfreport)
sum(diag(table(mexico$genderFB, mexico$genderselfreport)))/(sum(table(mexico$genderFB, mexico$genderselfreport))-23-27)


## create lowed category
incompletes%<>%
  mutate(lowed=case_when(
    grepl("9/9/19",StartDate)==TRUE~1,
    edselfreport%in%c("nosecondary","secondary")~1,
    edselfreport=="postsecondary"~0))
mexico%<>%
  mutate(lowed=case_when(
    grepl("9/9/19",StartDate)==TRUE~1,
    edselfreport%in%c("nosecondary","secondary")~1,
    edselfreport=="postsecondary"~0))

mexico.lowed<-mexico[grep("9/9/19",mexico$StartDate)[1]:nrow(mexico),] ## target by date fielded because we only used this targeting in the last part of our recruitment 
mexico.lowed%<>%
  mutate(edmatch2=ifelse(education%in%c("Ninguno","Primaria","Secundaria")|grepl("Bachillerato",education),1,0)) ## 1 indicates that self-reported education is highschool (bachillerato), technical/ professional school, but not university
prop.table(table(mexico.lowed$edmatch2))

table(mexico$geoFB, mexico$geoselfreport)
sum(diag(table(mexico$geoFB, mexico$geoselfreport)))/sum(table(mexico$geoFB, mexico$geoselfreport))

table(mexico$regionFB, mexico$regionselfreport)
sum(diag(table(mexico$regionFB, mexico$regionselfreport)))/sum(table(mexico$regionFB, mexico$regionselfreport))

## flag for accurate quota
mexico%<>%
  mutate(ageFB=gsub("to","",ageFB))%>%
  mutate(matchcells=ifelse(geoFB==geoselfreport & 
                             genderFB==genderselfreport & 
                             ageFB==agegroupselfreport,1,0))
table(mexico$matchcells)
sum(mexico$matchcells)/nrow(mexico)

## change values for ageFB in incompletes dataset to match those in the main dataset
incompletes%<>%mutate(ageFB=gsub("to","",ageFB))


### count number of people in each cell #### 
cellfill<-mexico%>%group_by(cell)%>%
  summarise(n=n())%>%
  arrange(desc(n))
cellfill
quantile(cellfill$n,probs=c(0.25,0.5,0.75))
range(cellfill$n)



# non-response bias: #### 
## compare facebook demographics for people who did and didn't complete the survey #### 
## overall attrition rate reported in paper: 
nrow(incompletes)/(nrow(incompletes)+nrow(mexico)) ## 46%

## Fig. 3: Predictors of non-response #### 
## figure is created in separate script, the code below outputs the regression table underlying the figure

ninrtab<-bind_rows(mexico%>%select(all_of(names(incompletes))),
                   incompletes%>%mutate(Duration..in.seconds.=as.numeric(Duration..in.seconds.)))%>%
  select(-ResponseId,-Duration..in.seconds.,-geoFB,-edselfreport)%>%### this is now the dataframe of ad clickers 
  mutate(lowed=as.character(lowed))

# ninr.n<-nrow(ninrtab)
ninr.n<-ninrtab%>%
  select(-cell)%>%
  pivot_longer(cols=-complete,names_to="variable")%>%
  group_by(variable)%>%
  count(value)%>%
  rename(ninr.n=n)
ninrtab2<-ninrtab%>% ## could leave "geoFB" in if wanted to focus on micro-regions, but there are lots of municipalities and 4 regions makes the table tractable. 
  pivot_longer(cols=-complete,names_to="variable")%>%
  group_by(variable,complete)%>%
  count(value)%>%
  left_join(ninr.n,by=c("variable","value"))%>%
  mutate(percent=n*100/ninr.n)%>%
  select(-n,-ninr.n)%>%
  arrange(variable,value,complete)%>%
  pivot_wider(id_cols=c("variable","value"),names_from="complete",
              values_from="percent")%>%
  rename("Non-respondents"="incomplete",
         "Respondents"="complete")%>%
  mutate(variable=gsub("FB","",variable))%>%
  select(variable,value,Respondents,`Non-respondents`)

ninrtab%<>%mutate(attrition=ifelse(complete=="incomplete",1,0),
                  ageFB=relevel(factor(ageFB),ref="1829"))
fit<-lm_robust(attrition~genderFB+ageFB,data=ninrtab)
fit<-broom::tidy(fit)
write_csv(fit,file=paste0(data_exports,"fig3_attritionreg_mexico.csv"))
    


## load Americas barometer 2016/17 data
ab<-read.dta13('mexico data/Mexico LAPOP AmericasBarometer 2019 v1.0_W.dta',nonint.factors = TRUE)
sort(names(ab))
class(ab$env1c)
mean(ab$env1c,na.rm=T) ## 4.077316; previously had been 3.909627
# install.packages('plotrix')
plotrix::std.error(ab$env1c,na.rm=T)
mean(mexico$env1c2)

# figure 5: policy preferences #### 
## figure is created in another script. this code creates the data underlying the figure
## create binary variables for each level of env1c
ab2<-ab%>%
  filter(!is.na(env1c))
# n<-nrow(ab2)

ab2<-ab2%>%select(env1c)%>%fastDummies::dummy_cols()%>%
  select(-env1c)
ab2<-svydesign(ids=~0,data=ab2)
vars<-names(ab2$variables)
ab2<-data.frame(svymean(make.formula(vars),ab2,na.rm=TRUE))
ab2$var<-rownames(ab2)
ab2<-ab2%>%
  separate(var,into=c("var","term"),sep="_",remove=TRUE)%>%
  filter(var!="weight")
names(ab2)[1:2]<-c("estimate","std.error")

fb2<-mexico%>%
  select(env1c2)%>%
  fastDummies::dummy_cols()%>%
  select(-env1c2)
fb2$weight<-mexico$weight
fb2<-svydesign(ids=~0,data=fb2,weights=fb2$weight)
vars<-names(fb2$variables)
fb2<-data.frame(svymean(make.formula(vars),fb2,na.rm=TRUE))
fb2$var<-rownames(fb2)
fb2<-fb2%>%
  separate(var,into=c("var","term"),sep="_",remove=TRUE)%>%
  filter(var!="weight")
names(fb2)[1:2]<-c("estimate","std.error")

## unweighted 
fb3<-mexico%>%
  select(env1c2)%>%
  fastDummies::dummy_cols()%>%
  select(-env1c2)
fb3<-svydesign(ids=~0,data=fb3)
vars<-names(fb3$variables)
fb3<-data.frame(svymean(make.formula(vars),fb3,na.rm=TRUE))
fb3$var<-rownames(fb3)
fb3<-fb3%>%
  separate(var,into=c("var","term"),sep="_",remove=TRUE)%>%
  filter(var!="weight")
names(fb3)[1:2]<-c("estimate","std.error")


fb2$model<-"Facebook (weighted)"
ab2$model<-"LAPOP"
fb3$model<-"Facebook (unweighted)"
policy2<-do.call(rbind,list(fb2,ab2,fb3))
# mycol<-RColorBrewer::brewer.pal(8,"Paired")[c(2,8)]
write_csv(policy2,paste0(data_exports,"fig4_policy_mexico.csv"))

# create data for figure 2 #### 
## figure is created in a separate script; this code creates the data underlying the figure
marriage<-data.frame(read.csv('mexico data/mexico_marriage_national.csv'))
marriage<-melt(marriage)
names(marriage)<-c("term","estimate")
marriage$std.error<-0
marriage$model<-"Census"
marriage$var<-"marital.status"
marriage$term<-tolower(marriage$term)
marriage$term<-factor(marriage$term,levels=c("soltera","casada","en.unión.libre",
                                                                 "separada","divorciada","viuda","no.especificado"),
                                labels=c("Single","Married","Civil union","Separated","Divorced","Widowed","Unspecified"))
marriage<-marriage%>%filter(term!="Unspecified")
marriage_fb<-mexico%>%
  select(marital.status)%>%
  mutate(marital.status=as.character(marital.status),
                marital.status=substr(marital.status,5,nchar(marital.status)),
                marital.status=stringr::str_replace_all(marital.status, "[^[:alnum:]]", ""),
                marital.status=ifelse(substring(marital.status,nchar(marital.status))=="a",
                                      paste0(substr(marital.status,1,nchar(marital.status)-2),"a"),
                                      marital.status))
marriage_fb$marital.status<-factor(marriage_fb$marital.status,levels=c("soltera","casada","consuparejaenuniónlibre",
                                                                       "separada","divorciada","viuda"),
                                   labels=c("Single","Married","Civilunion","Separated","Divorced","Widowed"))
marriage_fb<-marriage_fb%>%
  fastDummies::dummy_cols()%>%
  select(-marital.status)
marriage_fb<-svydesign(ids=~0,data=marriage_fb)
vars<-names(marriage_fb$variables)
marriage_fb<-data.frame(svymean(make.formula(vars),marriage_fb,na.rm=TRUE))
marriage_fb$var<-rownames(marriage_fb)
marriage_fb<-marriage_fb%>%
  separate(var,into=c("var","term"),sep="_",remove=TRUE)%>%
  filter(var!="weight")
names(marriage_fb)[1:2]<-c("estimate","std.error")
marriage_fb$model<-"Facebook Sample"
marriage<-rbind(marriage,marriage_fb)


# ## religion 
 
religion_fb<-data.frame(round(prop.table(table(mexico$religion))*100,1))
religions<-data.frame(religion_fb$Var1)
religions$religion.recode<-c("None","Catholic",
                             "NonCatholic","NonCatholic",
                             "NonCatholic","None",
                             "dk","NonCatholic",
                             "NonCatholic","NonCatholic",
                             "NonCatholic")
religion_fb<-left_join(mexico,religions,by=c("religion"="religion_fb.Var1"))%>%
  select(religion.recode)%>%
  fastDummies::dummy_cols()%>%
  select(-religion.recode)
religion_fb<-svydesign(ids=~0,data=religion_fb)
vars<-names(religion_fb$variables)
religion_fb<-data.frame(svymean(make.formula(vars),religion_fb,na.rm=TRUE))
religion_fb$var<-rownames(religion_fb)
religion_fb<-religion_fb%>%
  separate(var,into=c("var","term"),sep="_",remove=TRUE)%>%
  filter(var!="weight")
names(religion_fb)[1:2]<-c("estimate","std.error")
religion_fb$model<-"Facebook Sample"
religion_fb$term<-recode(religion_fb$term,"NonCatholic"="Non-Catholic","None"="No religion")

catholic<-data.frame(read.csv("mexico data/Religion_catholic.csv",header=FALSE,stringsAsFactors = FALSE))
noncatholic<-data.frame(read.csv("mexico data/Religion_otherthancatholic.csv",header=FALSE,stringsAsFactors = FALSE))
noreligion<-data.frame(read.csv("mexico data/Religion_none.csv",header=FALSE,stringsAsFactors = FALSE))
religion<-do.call(rbind,list(catholic,noncatholic,noreligion))
religion$total<-sum(religion$V2)

religion<-religion%>%
  dplyr::mutate(estimate=round(V2/total,3)*100)%>%
  dplyr::select(V1,estimate)
religion$std.error<-0
names(religion)[1]<-"term"
religion$var<-"religion.recode"

religion$model<-"Census"
religion<-rbind(religion_fb,religion)

religion<-religion%>%filter(term=="Catholic"|term=="Non-Catholic"|term=="No religion")

# ## education 
load("mexico data/weightingdata_mexico.Rda")
edtotal<-sum(ed$Freq)
education<-ed%>%
  mutate(edselfreport=case_when(edselfreport=="nosecondary"~"Did not complete secondary school",
                                edselfreport=="secondary"~"Secondary degree",
                                edselfreport=="postsecondary"~"College degree or higher"))%>%
  group_by(edselfreport)%>%
  summarise(Freq=sum(Freq))%>%
  ungroup()%>%
  rename(term=edselfreport)%>%
  mutate(estimate=Freq/edtotal,
         std.error=0,
         model="Census",
         var="education")%>%
  select(-Freq)

education_fb<-mexico%>%
  select(edselfreport)%>%
  fastDummies::dummy_cols()%>%
  select(-edselfreport)
education_fb<-svydesign(ids=~0,data=education_fb)
vars<-names(education_fb$variables)
education_fb<-data.frame(svymean(make.formula(vars),education_fb,na.rm=TRUE))
education_fb$var<-rownames(education_fb)
education_fb<-education_fb%>%
  separate(var,into=c("var","term"),sep="_",remove=TRUE)%>%
  filter(var!="weight")%>%
  mutate(term=case_when(term=="nosecondary"~"Did not complete secondary school",
                        term=="postsecondary"~"College degree or higher",
                        term=="secondary"~"Secondary degree"))%>%
  filter(!is.na(term))
names(education_fb)[1:2]<-c("estimate","std.error")
education_fb$model<-"Facebook Sample"


education<-rbind(education_fb,education)
education<-filter(education,term!="none")

census<-do.call(rbind,list(religion,marriage,education))
## Add Lapop sample
## find demographics in americas barometer survey.  
## ed is education
## q1 is gender 
## q2 is age
## q3cn is religion
## q11n is marital status

religion_ab<-ab%>%select(q3cn)%>%
  mutate(q3cn=case_when(q3cn==levels(ab$q3cn)[1]~"Catholic",
                        q3cn%in%levels(ab$q3cn)[c(2,3,5,6,8)]==TRUE~"Noncatholic",
                        q3cn%in%levels(ab$q3cn)[c(4,7)]==TRUE~"None"))%>%
  fastDummies::dummy_cols()%>%
  select(-q3cn)
religion_ab<-svydesign(ids=~0,data=religion_ab)
vars<-names(religion_ab$variables)
religion_ab<-data.frame(svymean(make.formula(vars),religion_ab,na.rm=TRUE))
religion_ab$var<-rownames(religion_ab)
religion_ab<-religion_ab%>%
  separate(var,into=c("var","term"),sep="_",remove=TRUE)%>%
  filter(var!="weight")
names(religion_ab)[1:2]<-c("estimate","std.error")
religion_ab$model<-"LAPOP"
religion_ab
religion_ab<-mutate(religion_ab,term=case_when(
  term=="Noncatholic"~"Non-Catholic",
  term=="None"~"No religion",
  TRUE~term
))%>%
  filter(term!="NA")

marriage_ab<-ab%>%
  select(q11n)%>%
  mutate(q11n=case_when(q11n=="Soltero"~"Single",
                        q11n=="Casado"~"Married",
                        q11n=="Divorciado"~"Divorced"))%>%
  filter(!is.na(q11n))%>%
  fastDummies::dummy_cols()%>%
  select(-q11n)
marriage_ab<-svydesign(ids=~0,data=marriage_ab)
vars<-names(marriage_ab$variables)
marriage_ab<-data.frame(svymean(make.formula(vars),marriage_ab,na.rm=TRUE))
marriage_ab$var<-rownames(marriage_ab)
marriage_ab<-marriage_ab%>%
  separate(var,into=c("var","term"),sep="_",remove=TRUE)%>%
  filter(var!="weight")
names(marriage_ab)[1:2]<-c("estimate","std.error")
marriage_ab$model<-"LAPOP"
marriage_ab

education_ab<-ab%>%select(ed)%>%
  mutate(ed=case_when(ed<9~"nosecondary", # less than 3rd year of secondary school
                      ed>=9&ed<16~"secondary", # 3rd year of secondary school through 3rd year college
                      ed>=16&ed<=18~"postsecondary"))%>% ## 4th year of college + 
  fastDummies::dummy_cols()%>%
  select(-ed)
education_ab<-svydesign(ids=~0,data=education_ab)
vars<-names(education_ab$variables)
education_ab<-data.frame(svymean(make.formula(vars),education_ab,na.rm=TRUE))
education_ab$var<-rownames(education_ab)
education_ab<-education_ab%>%
  separate(var,into=c("var","term"),sep="_",remove=TRUE)%>%
  filter(term!="NA")%>%
  mutate(term=case_when(term=="nosecondary"~"Did not complete secondary school",
                        term=="secondary"~"Secondary degree",
                        term=="postsecondary"~"College degree or higher"))
names(education_ab)[1:2]<-c("estimate","std.error")
education_ab$model<-"LAPOP"
education_ab
  

census<-do.call(rbind,list(census,religion_ab,education_ab,marriage_ab))

## add basic demos: age and gender
ab<-ab%>%
  mutate(age=case_when(q2>=18&q2<=29~'1829',
                       q2>29&q2<=49~"3049",
                       q2>49&q2<=59~"5059",
                       q2>59~"6065"),
         gender=ifelse(q1=="Hombre","M","F"))
ab_age<-ab%>%select(age)%>%
  fastDummies::dummy_cols()%>%
  select(-age)
ab_age<-svydesign(ids=~0,data=ab_age)
vars<-names(ab_age$variables)
ab_age<-data.frame(svymean(make.formula(vars),ab_age,na.rm=TRUE))
ab_age$var<-rownames(ab_age)
ab_age<-ab_age%>%
  separate(var,into=c("var","term"),sep="_",remove=TRUE)%>%
  filter(term!="NA")
names(ab_age)[1:2]<-c("estimate","std.error")
ab_age$model<-"LAPOP"
ab_age
  

ab_gender<-ab%>%select(gender)%>%
  fastDummies::dummy_cols()%>%
  select(-gender)
ab_gender<-svydesign(ids=~0,data=ab_gender)
vars<-names(ab_gender$variables)
ab_gender<-data.frame(svymean(make.formula(vars),ab_gender,na.rm=TRUE))
ab_gender$var<-rownames(ab_gender)
ab_gender<-ab_gender%>%
  separate(var,into=c("var","term"),sep="_",remove=TRUE)%>%
  filter(term!="NA")
names(ab_gender)[1:2]<-c("estimate","std.error")
ab_gender$model<-"LAPOP"
ab_gender

fb_gender<-mexico%>%select(genderselfreport)%>%
  fastDummies::dummy_cols()%>%
  select(-genderselfreport)
fb_gender<-svydesign(ids=~0,data=fb_gender)
vars<-names(fb_gender$variables)
fb_gender<-data.frame(svymean(make.formula(vars),fb_gender,na.rm=TRUE))
fb_gender$var<-rownames(fb_gender)
fb_gender<-fb_gender%>%
  separate(var,into=c("var","term"),sep="_",remove=TRUE)%>%
  filter(term!="NA")
names(fb_gender)[1:2]<-c("estimate","std.error")
fb_gender$model<-"Facebook Sample"
fb_gender

fb_age<-mexico%>%select(agegroupselfreport)%>%
  fastDummies::dummy_cols()%>%
  select(-agegroupselfreport)
fb_age<-svydesign(ids=~0,data=fb_age)
vars<-names(fb_age$variables)
fb_age<-data.frame(svymean(make.formula(vars),fb_age,na.rm=TRUE))
fb_age$var<-rownames(fb_age)
fb_age<-fb_age%>%
  separate(var,into=c("var","term"),sep="_",remove=TRUE)%>%
  filter(term!="NA")
names(fb_age)[1:2]<-c("estimate","std.error")
fb_age$model<-"Facebook Sample"
fb_age

### Ad clicker demos #### 
adclickers_age<-ninrtab%>%select(ageFB)%>%
  fastDummies::dummy_cols()%>%
  select(-ageFB)
adclickers_age<-svydesign(ids=~0,data=adclickers_age)
vars<-names(adclickers_age$variables)
adclickers_age<-data.frame(svymean(make.formula(vars),adclickers_age,na.rm=TRUE))
adclickers_age$var<-rownames(adclickers_age)
adclickers_age<-adclickers_age%>%
  separate(var,into=c("var","term"),sep="_",remove=TRUE)%>%
  filter(term!="NA")
names(adclickers_age)[1:2]<-c("estimate","std.error")
adclickers_age$model<-"Facebook (ad clickers)"
adclickers_age

adclickers_gender<-ninrtab%>%select(genderFB)%>%
  fastDummies::dummy_cols()%>%
  select(-genderFB)
adclickers_gender<-svydesign(ids=~0,data=adclickers_gender)
vars<-names(adclickers_gender$variables)
adclickers_gender<-data.frame(svymean(make.formula(vars),adclickers_gender,na.rm=TRUE))
adclickers_gender$var<-rownames(adclickers_gender)
adclickers_gender<-adclickers_gender%>%
  separate(var,into=c("var","term"),sep="_",remove=TRUE)%>%
  filter(term!="NA")
names(adclickers_gender)[1:2]<-c("estimate","std.error")
adclickers_gender$model<-"Facebook (ad clickers)"
adclickers_gender


gentotal<-sum(gender$Freq)
gender%<>%
  mutate(estimate=round(Freq/gentotal,1),
         std.error=0,
         var="gender",
         model="Census")%>%
  select(-Freq)%>%
  rename(term=genderselfreport)

agetotal<-sum(age$Freq)
age%<>%
  mutate(estimate=round(Freq/agetotal,1),
         std.error=0,
         var="age",
         model="Census")%>%
  select(-Freq)%>%
  rename(term=agegroupselfreport)

census<-do.call(rbind,list(census,ab_age,ab_gender,fb_age,fb_gender,adclickers_age,adclickers_gender,age,gender))

## read in facebook population data 
fb.pop<-read.csv("mexico data/mexico_facebook_population.csv",stringsAsFactors = FALSE)
fb.summary<-do.call(rbind,list(fb.pop%>%
                     group_by(genders)%>%
                     summarise(estimate=sum(mau_audience))%>%
                       rename(term=genders)%>%
                       mutate(var="gender"),
                   fb.pop%>%
                     group_by(age_range_min)%>%
                     summarise(estimate=sum(mau_audience))%>%
                     rename(term=age_range_min)%>%
                     mutate(var="age"),
                   fb.pop%>%
                     group_by(education_level)%>%
                     summarise(estimate=sum(mau_audience))%>%
                     rename(term=education_level)%>%
                     mutate(var="education")))%>%
  filter(term!="all")
fb.summary<-fb.summary%>%
  mutate(estimate=case_when(
    var=="gender"~estimate/sum(fb.summary[var=="gender","estimate"]),
    var=="age"~estimate/sum(fb.summary[var=="age","estimate"]),
    var=="education"~estimate/sum(fb.summary[var=="education","estimate"])
  ))%>%
  mutate(term=case_when(term=="College grad"|term=="Doctorate degree"|term=="Some grad school"|term=="In grad school"|term=="Masters degree"~"College degree or higher",
                        term=="Associate degree"|term=="Some college"|term=="High school grad"|term=="In college"|term=="Professional degree"~"Secondary degree",
                        term=="In high school"|term=="Some high school"~"Did not complete secondary school",
                        TRUE~term))%>%
  group_by(term,var)%>%
  summarise(estimate=sum(estimate))%>%
  ungroup()%>%
  mutate(std.error=0,
         model="Facebook Population")%>%
  arrange(var)
  
census<-rbind(fb.summary,census)

census$term<-recode(census$term,"M"="Male","F"="Female","6065"="60+","5059"="50-59",
                    "3049"="30-49","1829"="18-29","18"="18-29","30"="30-49","50"="50-59","60"="60+",
                    "female"="Female","male"="Male","Unspecified"="Unspecified education")
census<-census%>%
  filter(term!="Civil union"&term!="Civilunion"&term!="Widowed"&term!="Separated")%>%
  mutate(estimate=ifelse(model=="Census"&estimate>1,estimate/100,estimate))
census%<>%filter(!is.na(term))
write.csv(census,file=paste0(data_exports,"fig2_demos_mexico_unadjusted.csv"))


#### compare weighted samples  ####
marriage%<>%filter(model=="Census")
  
marriage_fb_weighted<-mexico%>%
  select(marital.status)%>%
  mutate(marital.status=as.character(marital.status),
         marital.status=substr(marital.status,5,nchar(marital.status)),
         marital.status=stringr::str_replace_all(marital.status, "[^[:alnum:]]", ""),
         marital.status=ifelse(substring(marital.status,nchar(marital.status))=="a",
                               paste0(substr(marital.status,1,nchar(marital.status)-2),"a"),
                               marital.status))
marriage_fb_weighted$marital.status<-factor(marriage_fb_weighted$marital.status,levels=c("soltera","casada","consuparejaenuniónlibre",
                                                                       "separada","divorciada","viuda"),
                                   labels=c("Single","Married","Civilunion","Separated","Divorced","Widowed"))
marriage_fb_weighted<-marriage_fb_weighted%>%
  fastDummies::dummy_cols()%>%
  select(-marital.status)
marriage_fb_weighted<-svydesign(ids=~0,data=marriage_fb_weighted,weights=mexico$weight)
vars<-names(marriage_fb_weighted$variables)
marriage_fb_weighted<-data.frame(svymean(make.formula(vars),marriage_fb_weighted,na.rm=TRUE))
marriage_fb_weighted$var<-rownames(marriage_fb_weighted)
marriage_fb_weighted<-marriage_fb_weighted%>%
  separate(var,into=c("var","term"),sep="_",remove=TRUE)%>%
  filter(var!="weight")
names(marriage_fb_weighted)[1:2]<-c("estimate","std.error")
marriage_fb_weighted$model<-"Facebook Sample"
marriage<-rbind(marriage,marriage_fb_weighted)


# ## religion

religion_fb_weighted<-left_join(mexico,religions,by=c("religion"="religion_fb.Var1"))%>%
  select(religion.recode)%>%
  fastDummies::dummy_cols()%>%
  select(-religion.recode)
religion_fb_weighted<-svydesign(ids=~0,data=religion_fb_weighted,weights=mexico$weight)
vars<-names(religion_fb_weighted$variables)
religion_fb_weighted<-data.frame(svymean(make.formula(vars),religion_fb_weighted,na.rm=TRUE))
religion_fb_weighted$var<-rownames(religion_fb_weighted)
religion_fb_weighted<-religion_fb_weighted%>%
  separate(var,into=c("var","term"),sep="_",remove=TRUE)%>%
  filter(var!="weight")
names(religion_fb_weighted)[1:2]<-c("estimate","std.error")
religion_fb_weighted$model<-"Facebook Sample"
religion_fb_weighted$term<-recode(religion_fb_weighted$term,"NonCatholic"="Non-Catholic","None"="No religion")

religion%<>%filter(model=="Census")

religion<-rbind(religion_fb_weighted,religion)

religion<-religion%>%filter(term=="Catholic"|term=="Non-Catholic"|term=="No religion")

# ## education
education%<>%filter(model=="Census")



education_fb_weighted<-mexico%>%
  select(edselfreport)%>%
  fastDummies::dummy_cols()%>%
  select(-edselfreport)
education_fb_weighted<-svydesign(ids=~0,data=education_fb_weighted,weights=mexico$weight)
vars<-names(education_fb_weighted$variables)
education_fb_weighted<-data.frame(svymean(make.formula(vars),education_fb_weighted,na.rm=TRUE))
education_fb_weighted$var<-rownames(education_fb_weighted)
education_fb_weighted<-education_fb_weighted%>%
  separate(var,into=c("var","term"),sep="_",remove=TRUE)%>%
  filter(var!="weight")%>%  mutate(term=case_when(term=="nosecondary"~"Did not complete secondary school",
                                                  term=="postsecondary"~"College degree or higher",
                                                  term=="secondary"~"Secondary degree"))
names(education_fb_weighted)[1:2]<-c("estimate","std.error")
education_fb_weighted$model<-"Facebook Sample"


education<-rbind(education_fb_weighted,education)

census_weighted<-do.call(rbind,list(religion,marriage,education))

census_weighted<-do.call(rbind,list(census_weighted,religion_ab,education_ab,marriage_ab))

fb_gender<-mexico%>%select(genderselfreport)%>%
  fastDummies::dummy_cols()%>%
  select(-genderselfreport)
fb_gender<-svydesign(ids=~0,data=fb_gender,weights=mexico$weight)
vars<-names(fb_gender$variables)
fb_gender<-data.frame(svymean(make.formula(vars),fb_gender,na.rm=TRUE))
fb_gender$var<-rownames(fb_gender)
fb_gender<-fb_gender%>%
  separate(var,into=c("var","term"),sep="_",remove=TRUE)%>%
  filter(term!="NA")
names(fb_gender)[1:2]<-c("estimate","std.error")
fb_gender$model<-"Facebook Sample"
fb_gender

fb_age<-mexico%>%select(agegroupselfreport)%>%
  fastDummies::dummy_cols()%>%
  select(-agegroupselfreport)
fb_age<-svydesign(ids=~0,data=fb_age,weights=mexico$weight)
vars<-names(fb_age$variables)
fb_age<-data.frame(svymean(make.formula(vars),fb_age,na.rm=TRUE))
fb_age$var<-rownames(fb_age)
fb_age<-fb_age%>%
  separate(var,into=c("var","term"),sep="_",remove=TRUE)%>%
  filter(term!="NA")
names(fb_age)[1:2]<-c("estimate","std.error")
fb_age$model<-"Facebook Sample"
fb_age

census_weighted<-do.call(rbind,list(fb.summary,census_weighted,ab_age,ab_gender,fb_age,fb_gender,age,gender))
census_weighted$term<-recode(census_weighted$term,"M"="Male","F"="Female","6065"="60+","5059"="50-59",
                    "3049"="30-49","1829"="18-29","18"="18-29","30"="30-49","50"="50-59","60"="60+",
                    "female"="Female","male"="Male","Unspecified"="Unspecified education")


unique(census_weighted$term)
census_weighted<-census_weighted%>%
  filter(term!="Civil union"&term!="Civilunion"&term!="Widowed"&term!="Separated")%>%
  mutate(estimate=ifelse(model=="Census"&estimate>1,estimate/100,estimate))
census_weighted%<>%filter(!is.na(term),term!="NA")
write_csv(census_weighted,paste0(data_exports,"fig2_demos_mexico_weighted.csv"))


# cost analysis ####
centro<-read.csv("mexico data/mexico_adsets/Yale-Survey-Research-Lab-Ad-Sets-Lifetime (4).csv",stringsAsFactors = FALSE)
other<-read.csv("mexico data/mexico_adsets/Yale-Survey-Research-Lab-Ad-Sets-Lifetime (5).csv",stringsAsFactors = FALSE)
centro2<-read.csv("mexico data/mexico_adsets/Yale-Survey-Research-Lab-Ad-Sets-Lifetime (7).csv",stringsAsFactors = FALSE)
other2<-read.csv("mexico data/mexico_adsets/Yale-Survey-Research-Lab-Ad-Sets-Lifetime (6).csv",stringsAsFactors = FALSE)

unique(other2$Ad.Set.Name)

full<-do.call(rbind,list(centro,centro2,other,other2))
sort(unique(full$Ad.Set.Name))

full$cell<-tolower(full$Ad.Set.Name)
full<-full%>%
  mutate(cell=gsub("centro 1a","centro 1",cell),
         cell=gsub("centro 1b","centro 1",cell),
         cell=gsub("occidente 1a","occidente 1",cell),
         cell=gsub("occidente 1b","occidente 1",cell),
         cell=gsub("norte 1a","norte 1",cell),
         cell=gsub("norte 1b","norte 1",cell),
         cell=gsub("sur 1a","sur 1",cell),
         cell=gsub("sur 1b","sur 1",cell),
         cell=gsub("sur 1c","sur 1",cell),
         cell=gsub("sur 1d","sur 1",cell),
         cell=gsub("sur 1e","sur 1",cell))
## collapse cells that were combined
# full[grep("CENTRO 1",full$cell),"cell"]<-"CENTRO 1"
# full[grep("SUR 1",full$cell),"cell"]<-"SUR 1"
# full[grep("NORTE 1",full$cell),"cell"]<-"NORTE 1"
# full[grep("OCCIDENTE 1",full$cell),"cell"]<-"OCCIDENTE 1"
full[full$cell=="","cell"]<-"total"
length(unique(full$cell))


full.sum<-full%>%
  group_by(cell)%>%
  summarise(reach=sum(Reach,na.rm=TRUE),
            impressions=sum(Impressions,na.rm=TRUE),
            results=sum(Results,na.rm=TRUE),
            spent=sum(Amount.Spent..USD.,na.rm=TRUE),
            clicks=sum(Unique.Link.Clicks,na.rm=TRUE))%>%
  mutate(clickthroughrate=clicks/impressions,
         completerate=results/impressions,
         costperclick=spent/clicks,
         costpercomplete=spent/results)%>%
  filter(cell!="total")

## Table S4: stats on reach, impressions, clicks, results, and spending ####
mexico_summary<-full.sum%>%
  summarise(
    Impressions=sum(impressions,na.rm=TRUE),
    Reach=sum(reach,na.rm=TRUE),
    Clicks=sum(clicks,na.rm=TRUE),
    Survey_results=nrow(mexico),
    Total_spent=sum(spent,na.rm=TRUE),
    Click_through_rate=Clicks/Impressions,
    Completion_rate=Survey_results/Impressions,
    Cost_per_click=Total_spent/Clicks,
    Cost_per_complete=Total_spent/Survey_results
  )
print(data.frame(mexico_summary))


## table S.5: click-through, completion, and cost across targeting cells #### 
summary(full.sum$clickthroughrate)*100
summary(full.sum$completerate)*100
summary(full.sum$costperclick)
summary(full.sum$costpercomplete)

## Table S.6, stats for initial sample and low-ed oversample #### 
## initial sample first: 
initial<-rbind(centro,other)

initial$cell<-tolower(initial$Ad.Set.Name)
initial<-initial%>%
  mutate(cell=gsub("centro 1a","centro 1",cell),
         cell=gsub("centro 1b","centro 1",cell),
         cell=gsub("occidente 1a","occidente 1",cell),
         cell=gsub("occidente 1b","occidente 1",cell),
         cell=gsub("norte 1a","norte 1",cell),
         cell=gsub("norte 1b","norte 1",cell),
         cell=gsub("sur 1a","sur 1",cell),
         cell=gsub("sur 1b","sur 1",cell),
         cell=gsub("sur 1c","sur 1",cell),
         cell=gsub("sur 1d","sur 1",cell),
         cell=gsub("sur 1e","sur 1",cell))
initial[initial$cell=="","cell"]<-"total"

initial.sum<-initial%>%
  group_by(cell)%>%
  summarise(reach=sum(Reach,na.rm=TRUE),
            impressions=sum(Impressions,na.rm=TRUE),
            results=sum(Results,na.rm=TRUE),
            spent=sum(Amount.Spent..USD.,na.rm=TRUE),
            clicks=sum(Unique.Link.Clicks,na.rm=TRUE))%>%
  mutate(clickthroughrate=clicks/impressions,
         completerate=results/impressions,
         costperclick=spent/clicks,
         costpercomplete=spent/results)%>%
  filter(cell!="total")%>%
  arrange(costpercomplete)

## stats for paper: Table S6
sum(initial.sum$reach)
sum(initial.sum$impressions)
sum(initial.sum$clicks)
sum(initial.sum$results)
sum(initial.sum$spent)

## Table S7
summary(initial.sum$clickthroughrate) * 100
summary(initial.sum$completerate)*100
summary(initial.sum$costperclick)
summary(initial.sum$costpercomplete)


## low education oversample next ####
over<-rbind(centro2,other2)

over$cell<-tolower(over$Ad.Set.Name)
over<-over%>%
  mutate(cell=gsub("centro 1a","centro 1",cell),
         cell=gsub("centro 1b","centro 1",cell),
         cell=gsub("occidente 1a","occidente 1",cell),
         cell=gsub("occidente 1b","occidente 1",cell),
         cell=gsub("norte 1a","norte 1",cell),
         cell=gsub("norte 1b","norte 1",cell),
         cell=gsub("sur 1a","sur 1",cell),
         cell=gsub("sur 1b","sur 1",cell),
         cell=gsub("sur 1c","sur 1",cell),
         cell=gsub("sur 1d","sur 1",cell),
         cell=gsub("sur 1e","sur 1",cell))
over[over$cell=="","cell"]<-"total"

over.sum<-over%>%
  group_by(cell)%>%
  summarise(reach=sum(Reach,na.rm=TRUE),
            impressions=sum(Impressions,na.rm=TRUE),
            results=sum(Results,na.rm=TRUE),
            spent=sum(Amount.Spent..USD.,na.rm=TRUE),
            clicks=sum(Unique.Link.Clicks,na.rm=TRUE))%>%
  mutate(clickthroughrate=clicks/impressions,
         completerate=results/impressions,
         costperclick=spent/clicks,
         costpercomplete=spent/results)%>%
  filter(cell!="total")%>%
  arrange(desc(costpercomplete))

## stats for paper: table S6
sum(over.sum$reach)
sum(over.sum$impressions)
sum(over.sum$clicks)
sum(over.sum$results)
sum(over.sum$spent)

## table S7
summary(over.sum$clickthroughrate)*100
summary(over.sum$completerate)*100
summary(over.sum$costperclick)
summary(over.sum$costpercomplete)


## cost by respondent type: Table S8
## this tells us the amount we spent advertising to each group
## divided by the number of people we recruited from each group 
## (defined as complete responses)
w<-full%>%
  filter(grepl("female",cell)==TRUE)%>%
  filter(cell!="total")%>%
  group_by(cell)%>%
  summarise(spent=sum(Amount.Spent..USD.,na.rm=TRUE))
## spend on ads targeting women divided by # of women respondents (self-reported)
sum(w$spent)/nrow(mexico[which(mexico$genderselfreport=="F"),])

m<-full%>%
  filter(grepl("female",cell)==FALSE)%>%
  filter(cell!="total")%>%
  group_by(cell)%>%
  summarise(spent=sum(Amount.Spent..USD.,na.rm=TRUE))
## spend on ads targeting men divided by # of men respondents (self-reported)
sum(m$spent)/nrow(mexico[which(mexico$genderselfreport=="M"),])


urban<-rbind(full[grep("3a",full$cell),],
             full[grep("4a",full$cell),])%>%
  filter(cell!="total")%>%
  group_by(cell)%>%
  summarise(spent=sum(Amount.Spent..USD.,na.rm=TRUE))
mexico.urban<-mexico%>%filter(grepl("3a",geoselfreport)==TRUE|
                                grepl("4a",geoselfreport)==TRUE)
sum(urban$spent)/nrow(mexico.urban)


rural<-rbind(full[grep(" 1 ",full$cell),],
             full[grep("2a",full$cell),])%>%
  filter(cell!="total")%>%
  group_by(cell)%>%
  summarise(spent=sum(Amount.Spent..USD., na.rm=TRUE))
## subset dataset to self-reports in the rural places 
mexico.rural<-mexico%>%filter(grepl("1",geoselfreport)==TRUE|
                                grepl("2a",geoselfreport)==TRUE)
## ratio of ad spend to these places to complete responses from these places 
sum(rural$spent)/nrow(mexico.rural) ## 0.19

